knitr::opts_chunk$set(warning = FALSE)
source(here::here('code', 'helpers.R'))
library(tidyverse)
library(forcats)
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
##
## stamp
library(agricolae)
library(ggupset)
library(RColorBrewer)
library(patchwork)
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
##
## align_plots
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## Registered S3 methods overwritten by 'vegan':
## method from
## plot.rda klaR
## predict.rda klaR
## print.rda klaR
## This is vegan 2.6-4
library(pheatmap)
library(ggradar)
library(broom)
data <- targets::tar_read(merged_all_results)
truth <- targets::tar_read(truth_set_data)
table(data$Type)
##
## BLAST100 BLAST97 CustomNBC DADA2Spec DADA2Tax Kraken_0.0
## 13912 21093 155700 157104 157104 104880
## Kraken_0.05 Kraken_0.1 Metabuli MMSeqs2_100 MMSeqs2_97 Mothur
## 104880 104880 73217 109368 109368 152892
## Qiime2 VSEARCH
## 23275 157104
Let’s remove the >0.2 Kraken runs, those are too strict
data <- data |> filter(!Type %in% c('Kraken_0.2', 'Kraken_0.3', 'Kraken_0.4', 'Kraken_0.5', 'Kraken_0.6', 'Kraken_0.7', 'Kraken_0.8', 'Kraken_0.9'))
Check whether any of the data is missing
type_list <- data |># filter(!str_detect(Subject, 'c01')) |>
group_by(Type) |>
summarize('Subject' = list(unique(Subject)))
all_subjects <- unique(data |># filter(!str_detect(Subject, 'c01')) |>
pull(Subject))
for(i in type_list$Type){
this_subjects <- type_list |> filter(Type == i) |> pull(Subject)
missing <- setdiff(all_subjects, this_subjects[[1]])
if (length(missing) > 0){
print(i)
print(missing)
}
}
## [1] "BLAST100"
## [1] "12S_v10_HmmCut.fasta" "16S_v04_HmmCut.fasta" "c01_v03_HmmCut.fasta"
## [1] "BLAST97"
## [1] "12S_v10_HmmCut.fasta" "16S_v04_HmmCut.fasta" "c01_v03_HmmCut.fasta"
## [1] "CustomNBC"
## [1] "c01_v03_HmmCut.fasta"
## [1] "MMSeqs2_100"
## [1] "12S_v10_HmmCut.fasta" "16S_v04_HmmCut.fasta" "c01_v03_HmmCut.fasta"
## [1] "MMSeqs2_97"
## [1] "12S_v10_HmmCut.fasta" "16S_v04_HmmCut.fasta" "c01_v03_HmmCut.fasta"
## [1] "Metabuli"
## [1] "c01_v03_HmmCut.fasta"
## [1] "Mothur"
## [1] "12S_v10_HmmCut.fasta" "16S_v04_HmmCut.fasta" "c01_v03_HmmCut.fasta"
## [1] "Qiime2"
## [1] "c01_v03_HmmCut.fasta"
OK, we can ignore the HmmCut ones.
data |> write_tsv('./results/cleaned_and_filtered_data.tsv.gz')
names(table(data$Query))
## [1] "KWest_16S_PooledTrue.fa"
## [2] "make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa"
## [3] "make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa"
## [4] "make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa"
## [5] "make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa"
## [6] "make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa"
## [7] "make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa"
## [8] "make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa"
## [9] "make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa"
## [10] "make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa"
## [11] "make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa"
## [12] "make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa"
## [13] "make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa"
## [14] "make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa"
table(data$Subject)
##
## 12s_v010_final.fasta
## 16830
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 16415
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 16154
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 16158
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 16252
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 16329
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 16443
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 16095
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 16206
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 16501
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 16287
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 16606
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 16516
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 16410
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 16576
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 16315
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 16537
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 16225
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 16388
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 16345
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 16522
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 15956
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 15695
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 16021
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 16039
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 15821
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 15944
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 15793
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 15829
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 15823
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 15866
## 12S_v10_HmmCut.fasta
## 8714
## 16S_v04_final.fasta
## 17914
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 16614
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 16600
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 16975
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 16807
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 17016
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 16929
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 16570
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 17081
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 16405
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 16525
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 17290
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 17107
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 17405
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 17070
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 17285
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 17609
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 17193
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 17344
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 17502
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 17399
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 16412
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 16567
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 16534
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 16431
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 16675
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 16414
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 16223
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 16385
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 16250
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 16358
## 16S_v04_HmmCut.fasta
## 9590
## c01_v03_final.fasta
## 12849
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 12526
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 13147
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 12573
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 13159
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 12347
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 12512
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 12463
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 12913
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 12469
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 13165
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 12695
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 12623
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 12596
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 12574
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 12660
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 12686
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 12569
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 12659
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 12812
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 12739
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 13229
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 13188
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 13049
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 12293
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 12250
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 12753
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 13043
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 12881
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 12231
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 12242
## c01_v03_HmmCut.fasta
## 6792
table(data$Subject)
##
## 12s_v010_final.fasta
## 16830
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 16415
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 16154
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 16158
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 16252
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 16329
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 16443
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 16095
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 16206
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 16501
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 16287
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 16606
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 16516
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 16410
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 16576
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 16315
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 16537
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 16225
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 16388
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 16345
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 16522
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 15956
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 15695
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 16021
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 16039
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 15821
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 15944
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 15793
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 15829
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 15823
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 15866
## 12S_v10_HmmCut.fasta
## 8714
## 16S_v04_final.fasta
## 17914
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 16614
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 16600
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 16975
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 16807
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 17016
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 16929
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 16570
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 17081
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 16405
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 16525
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 17290
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 17107
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 17405
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 17070
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 17285
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 17609
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 17193
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 17344
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 17502
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 17399
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 16412
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 16567
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 16534
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 16431
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 16675
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 16414
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 16223
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 16385
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 16250
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 16358
## 16S_v04_HmmCut.fasta
## 9590
## c01_v03_final.fasta
## 12849
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 12526
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 13147
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 12573
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 13159
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 12347
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 12512
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 12463
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 12913
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 12469
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 13165
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 12695
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 12623
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 12596
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 12574
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 12660
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 12686
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 12569
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 12659
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 12812
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 12739
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 13229
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 13188
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 13049
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 12293
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 12250
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 12753
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 13043
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 12881
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 12231
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 12242
## c01_v03_HmmCut.fasta
## 6792
twelveS_data <- data |> filter(Subject == '12s_v010_final.fasta')
sixteenS_data <- data |> filter(Subject == '16S_v04_final.fasta')
co1_data <- data |> filter(Subject == 'c01_v03_final.fasta')
table(twelveS_data$Query)
##
## KWest_16S_PooledTrue.fa
## 4263
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa
## 1321
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa
## 1321
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa
## 1095
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa
## 1095
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa
## 1068
## make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa
## 990
## make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa
## 1000
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa
## 316
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa
## 299
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa
## 281
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa
## 1343
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa
## 1226
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa
## 1212
table(sixteenS_data$Query)
##
## KWest_16S_PooledTrue.fa
## 4928
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa
## 1230
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa
## 1194
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa
## 1306
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa
## 1306
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa
## 1068
## make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa
## 990
## make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa
## 1000
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa
## 307
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa
## 341
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa
## 281
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa
## 1274
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa
## 1477
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa
## 1212
table(sixteenS_data$Subject)
##
## 16S_v04_final.fasta
## 17914
table(co1_data$Query)
##
## KWest_16S_PooledTrue.fa
## 2768
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa
## 795
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa
## 795
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa
## 799
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa
## 799
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa
## 1315
## make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa
## 792
## make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa
## 800
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa
## 192
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa
## 216
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa
## 346
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa
## 818
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa
## 899
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa
## 1515
twelveS_data_vs_12S_100 <- twelveS_data |> filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa')
sixteenS_data_vs_16S_100 <- sixteenS_data |> filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa' )
co1_data_vs_co1_100 <- co1_data |> filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa')
twelveS_data_vs_12S_100 |> select(Type, species) |> filter(species != 'dropped' &
!is.na(species)) |>
group_by(Type) |> count(species) |> summarise(n = n()) |>
ggplot(aes(x = Type, y = n, fill = n)) + geom_col() + coord_flip() +
theme_minimal() +
ylab('Count') +
ggtitle('12S: Species-level hits per classifier')
twelveS_data_vs_12S_100 |> select(Type, genus) |> filter(genus != 'dropped' &
!is.na(genus)) |>
group_by(Type) |> count(genus) |> summarise(n = n()) |>
ggplot(aes(x = Type, y = n, fill = n)) + geom_col() + coord_flip() +
theme_minimal() +
ylab('Count') +
ggtitle('12S: Genus-level hits per classifier')
twelveS_truth <- truth |> filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa') |> select(OTU, family, species) |> rename(True_OTU = OTU, True_family = family, True_species = species)
head(twelveS_truth)
## # A tibble: 6 × 3
## True_OTU True_family True_species
## <chr> <chr> <chr>
## 1 ASV_1 Syngnathidae Phyllopteryx taeniolatus
## 2 ASV_2 Carcharhinidae Glyphis garricki
## 3 ASV_3 Mullidae Parupeneus barberinus
## 4 ASV_4 Holocentridae Myripristis vittata
## 5 ASV_5 Scincidae Tropidophorus hainanus
## 6 ASV_6 Anatidae Aythya nyroca
twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
mutate(Correct = True_species == species) |>
filter(species != 'dropped' & !is.na(species)) |>
group_by(Type) |> count(Correct) |>
ggplot(aes(x = fct_rev(fct_reorder2(Type, Correct, n)), fill = Correct, y = n))+ geom_col() +
coord_flip() + theme_minimal() + xlab('Type') +
ggtitle('12S: Correct and incorrect species-level classifications (absolute)') +
scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7"))
cols <- c('Correct species' = "#009E73", 'Correct genus'="#56B4E9", 'Correct family' = "#0072B2", 'Incorrect family' = "#E69F00", 'Incorrect genus'="#F0E442", 'Incorrect species'="#D55E00", 'No hit'= "#CC79A7")
twelve_s_relative_plot <- twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type) |>
count(CorrectSpecies) |>
mutate(total = sum(n, na.rm=TRUE)) |>
mutate(missing = 99 - total) |>
group_modify(~ add_row(.x)) |>
group_modify(~ {
.x |> mutate(new_col= max(missing, na.rm=TRUE)) |>
mutate(n = case_when(is.na(CorrectSpecies) & is.na(missing) ~ new_col,
TRUE ~ n)) |>
select(-new_col)
} ) |>
mutate(total = 99) |>
mutate(perc = n / total * 100) |>
mutate(CorrectSpecies = replace_na(CorrectSpecies, 'No hit')) |>
mutate(CorrectSpecies = factor(CorrectSpecies, rev(c('Correct species', 'Correct genus', 'Correct family', 'Incorrect family', 'Incorrect genus', 'Incorrect species', 'No hit')))) |>
ggplot(aes(x = fct_rev(fct_reorder2(Type, CorrectSpecies, n)), fill = CorrectSpecies, y = perc))+
geom_col() +
coord_flip() +
theme_minimal() +
ylab('Percentage') + xlab('Type') +
ggtitle('12S: Correct and incorrect species-level classifications (relative)') +
scale_fill_manual(name = 'Outcome', values = cols, breaks=names(cols))
twelve_s_relative_plot
### Calculate Upset-based species sightings
type_list <- twelveS_data_vs_12S_100 |> select(Type, species) |> unique() |> filter(!is.na(species) & species != 'dropped') |>
group_by(species) |>
summarize('Type' = list(Type))
a <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('12S: Shared species') +
ylab('Species')
a
type_list <- twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
filter(species == True_species) |>
filter(species != 'dropped' & !is.na(species)) |>
select(Type, species) |> unique() |>
group_by(species) |>
summarize('Type' = list(Type))
b <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('12S: Shared correct species') +
ylab('Species')
b
type_list <- twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
filter(species != True_species) |>
filter(species != 'dropped' & !is.na(species)) |>
select(Type, species) |> unique() |>
group_by(species) |>
summarize('Type' = list(Type))
c <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('12S: Shared incorrect species') +
ylab('Species')
c
a + b + c & ylim(c(0, 30)) &
theme(
# Hide panel borders and remove grid lines
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
panel.grid.minor = element_blank(),
#panel.grid.major.y = element_line(),
# Change axis line
axis.line = element_line(colour = "black")
)
add_scores <- function(twelveS_data_vs_12S_100_with_MaxTruth, twelveS_truth ) {
twelveS_data_vs_12S_100_with_MaxTruth|> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type) |>
summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
FN = sum(is.na(species) & !is.na(True_species))) |>
mutate(sums = TP + FP + TN + FN) |>
mutate(missing = 99 - sums) |>
mutate(FN = FN + missing) |>
mutate(sums = TP + FP + TN + FN) |>
select(-c(missing, sums))
}
scores <- add_scores(twelveS_data_vs_12S_100, twelveS_truth)
scores <- scores |> rowwise() |> mutate(Recall = recall(TP, FN), Precision = precision(TP, FP),
f1 = f1(Precision, Recall), f0.5 = f0.5(Precision, Recall), accuracy = accuracy(TP, FP, FN, TN))
scores
## # A tibble: 14 × 10
## # Rowwise:
## Type TP FP TN FN Recall Precision f1 f0.5 accuracy
## <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 BLAST100 59 6 0 34 0.634 0.908 0.747 0.836 0.596
## 2 BLAST97 48 8 0 43 0.527 0.857 0.653 0.762 0.485
## 3 CustomNBC 42 19 0 38 0.525 0.689 0.596 0.648 0.424
## 4 DADA2Spec 59 6 0 34 0.634 0.908 0.747 0.836 0.596
## 5 DADA2Tax 12 16 0 71 0.145 0.429 0.216 0.308 0.121
## 6 Kraken_0.0 54 14 0 31 0.635 0.794 0.706 0.756 0.545
## 7 Kraken_0.05 51 7 0 41 0.554 0.879 0.68 0.787 0.515
## 8 Kraken_0.1 47 5 0 47 0.5 0.904 0.644 0.778 0.475
## 9 MMSeqs2_100 59 6 0 34 0.634 0.908 0.747 0.836 0.596
## 10 MMSeqs2_97 59 13 0 27 0.686 0.819 0.747 0.789 0.596
## 11 Metabuli 56 13 0 30 0.651 0.812 0.723 0.773 0.566
## 12 Mothur 41 20 0 38 0.519 0.672 0.586 0.635 0.414
## 13 Qiime2 29 22 0 48 0.377 0.569 0.453 0.516 0.293
## 14 VSEARCH 40 15 0 44 0.476 0.727 0.576 0.658 0.404
twelveS_scoreS_plot <- scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |> ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1.05)) + theme_minimal_hgrid()+ theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') + ggtitle('12S scores')
twelveS_scoreS_plot
scores |> select(-c(TP, FP, TN, FN)) |>
rename('group' = 'Type') |>
ggradar()
Let’s also make a heatmap from that
b <- scores$Type
m <- scores |> select(-Type) |> select(accuracy, Recall, Precision, f1, f0.5) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
color = colorRampPalette(brewer.pal(n = 7, name =
"RdYlGn"))(100))
b <- scores$Type
m <- scores |> select(-Type) |> select(TP, FP, FN) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
color = colorRampPalette(brewer.pal(n = 7, name =
"RdYlGn"))(100))
table(twelveS_data_vs_12S_100$Type)
##
## BLAST100 BLAST97 CustomNBC DADA2Spec DADA2Tax Kraken_0.0
## 86 95 99 99 99 99
## Kraken_0.05 Kraken_0.1 Metabuli MMSeqs2_100 MMSeqs2_97 Mothur
## 99 99 99 99 99 99
## Qiime2 VSEARCH
## 51 99
First, we count the per-OTU species hits
twelveS_data_vs_12S_100_maxCount <- twelveS_data_vs_12S_100 |>
mutate(species = na_if(species, 'dropped')) |>
filter(!is.na(species)) |>
#filter(! Type %in% c('Mothur', 'VSEARCH', 'Kraken_0.2', 'Mothur', 'Metabuli', 'NBC', 'BLAST97', 'Kraken_0.0', 'Kraken_0.1')) |>
group_by(Query, Subject, OTU) |>
count(species) |>
# double check the truth
#left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
#mutate(Truth = True_species == species) |>
# pull out the per-group highest n
filter( n > 4) |>
slice_max(n, n=1, with_ties = FALSE) |>
mutate(Type = 'MaxCount', .before = 'Query') |>
select(-n)
twelveS_data_vs_12S_100_maxCount
## # A tibble: 70 × 5
## # Groups: Query, Subject, OTU [70]
## Type Query Subject OTU species
## <chr> <chr> <chr> <chr> <chr>
## 1 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_1 Phyllo…
## 2 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Cirrip…
## 3 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Sterco…
## 4 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Carcha…
## 5 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Hemigy…
## 6 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Ctenoc…
## 7 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Daptio…
## 8 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Engrau…
## 9 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Bathyr…
## 10 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Tripho…
## # ℹ 60 more rows
twelveS_data_vs_12S_100_with_MaxTruth <- twelveS_data_vs_12S_100 |>
bind_rows(twelveS_data_vs_12S_100_maxCount) #|>
#filter(! Type %in% c('Mothur', 'VSEARCH', 'Kraken_0.2', 'Mothur', 'Metabuli', 'NBC', 'BLAST97', 'Kraken_0.0', 'Kraken_0.1'))
maxTruth_scores <- add_scores(twelveS_data_vs_12S_100_with_MaxTruth, twelveS_truth )
maxTruth_scores <- maxTruth_scores |> rowwise() |> mutate(Recall = recall(TP, FN), Precision = precision(TP, FP),
f1 = f1(Precision, Recall), f0.5 = f0.5(Precision, Recall), accuracy = accuracy(TP, FP, FN, TN))
maxTruth_scoreS_plot <- maxTruth_scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |> ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1)) + theme_minimal_hgrid()+ theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') + geom_point() + ggtitle('12S scores')
maxTruth_scoreS_plot
Interestingly, just counting the labels is not good! It performs worse
than BLAST.
sixteenS_truth <- truth |> filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa') |> select(OTU, family, species) |> rename(True_OTU = OTU, True_family = family, True_species = species)
head(sixteenS_truth)
## # A tibble: 6 × 3
## True_OTU True_family True_species
## <chr> <chr> <chr>
## 1 ASV_1 Syngnathidae Phyllopteryx taeniolatus
## 2 ASV_2 Carcharhinidae Glyphis garricki
## 3 ASV_3 Merlucciidae Merluccius productus
## 4 ASV_4 Mullidae Parupeneus barberinus
## 5 ASV_5 Syngnathidae Hippocampus algiricus
## 6 ASV_6 Eleotridae Bostrychus sinensis
sixteenS_relative_plot <- sixteenS_data_vs_16S_100 |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type) |>
count(CorrectSpecies) |>
mutate(total = sum(n)) |>
mutate(missing = 99 - total) |>
group_modify(~ add_row(.x)) |>
group_modify(~ {
.x |> mutate(new_col= max(missing, na.rm=TRUE)) |>
mutate(n = case_when(is.na(CorrectSpecies) & is.na(missing) ~ new_col,
TRUE ~ n)) |>
select(-new_col)
} ) |>
mutate(total = 99) |>
mutate(perc = n / total * 100) |>
mutate(CorrectSpecies = replace_na(CorrectSpecies, 'No hit')) |>
mutate(CorrectSpecies = factor(CorrectSpecies, rev(c('Correct species', 'Correct genus', 'Correct family', 'Incorrect family', 'Incorrect genus', 'Incorrect species', 'No hit')))) |>
tidyr::complete(CorrectSpecies, fill = list(n=0, total = 99, missing = NA, perc = 0)) |>
ggplot(aes(x = fct_rev(fct_reorder2(Type, CorrectSpecies, n)), fill = CorrectSpecies, y = perc))+
geom_col() +
coord_flip() +
theme_minimal() +
ylab('Percentage') + xlab('Type') +
ggtitle('16S: Correct and incorrect species-level classifications (relative)') +
scale_fill_manual(name = 'Outcome', values = cols, breaks=names(cols))
sixteenS_relative_plot
scores <- add_scores(sixteenS_data_vs_16S_100, sixteenS_truth)
scores
## # A tibble: 14 × 5
## Type TP FP TN FN
## <chr> <int> <int> <int> <dbl>
## 1 BLAST100 51 0 0 48
## 2 BLAST97 48 5 0 46
## 3 CustomNBC 52 10 0 37
## 4 DADA2Spec 50 1 0 48
## 5 DADA2Tax 21 30 0 48
## 6 Kraken_0.0 52 15 0 32
## 7 Kraken_0.05 48 13 0 38
## 8 Kraken_0.1 45 10 0 44
## 9 MMSeqs2_100 51 0 0 48
## 10 MMSeqs2_97 60 5 0 34
## 11 Metabuli 53 18 0 28
## 12 Mothur 50 14 0 35
## 13 Qiime2 42 15 0 42
## 14 VSEARCH 43 12 0 44
scores <- scores |> rowwise() |> mutate(Recall = recall(TP, FN), Precision = precision(TP, FP),
f1 = f1(Precision, Recall), f0.5 = f0.5(Precision, Recall), accuracy = accuracy(TP, FP, FN, TN))
scores
## # A tibble: 14 × 10
## # Rowwise:
## Type TP FP TN FN Recall Precision f1 f0.5 accuracy
## <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 BLAST100 51 0 0 48 0.515 1 0.68 0.842 0.515
## 2 BLAST97 48 5 0 46 0.511 0.906 0.653 0.784 0.485
## 3 CustomNBC 52 10 0 37 0.584 0.839 0.689 0.772 0.525
## 4 DADA2Spec 50 1 0 48 0.510 0.980 0.671 0.828 0.505
## 5 DADA2Tax 21 30 0 48 0.304 0.412 0.35 0.385 0.212
## 6 Kraken_0.0 52 15 0 32 0.619 0.776 0.689 0.739 0.525
## 7 Kraken_0.05 48 13 0 38 0.558 0.787 0.653 0.727 0.485
## 8 Kraken_0.1 45 10 0 44 0.506 0.818 0.625 0.728 0.455
## 9 MMSeqs2_100 51 0 0 48 0.515 1 0.68 0.842 0.515
## 10 MMSeqs2_97 60 5 0 34 0.638 0.923 0.755 0.847 0.606
## 11 Metabuli 53 18 0 28 0.654 0.746 0.697 0.726 0.535
## 12 Mothur 50 14 0 35 0.588 0.781 0.671 0.733 0.505
## 13 Qiime2 42 15 0 42 0.5 0.737 0.596 0.673 0.424
## 14 VSEARCH 43 12 0 44 0.494 0.782 0.606 0.700 0.434
sixteenS_score_plot <- scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |> ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1)) + theme_minimal_hgrid() + theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') + ggtitle('16S scores')
sixteenS_score_plot
b <- scores$Type
m <- scores |> select(-Type) |> select(accuracy, Recall, Precision, f1, f0.5) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
color = colorRampPalette(brewer.pal(n = 7, name =
"RdYlGn"))(100))
b <- scores$Type
m <- scores |> select(-Type) |> select(TP, FP, FN) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
color = colorRampPalette(brewer.pal(n = 7, name =
"RdYlGn"))(100))
### Calculate Upset-based species sightings
type_list <- sixteenS_data_vs_16S_100 |> select(Type, species) |> unique() |> filter(!is.na(species) & species != 'dropped') |>
group_by(species) |>
summarize('Type' = list(Type))
a <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('16S: Shared species') +
ylab('Species')
a
type_list <- sixteenS_data_vs_16S_100 |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
filter(species == True_species) |>
filter(species != 'dropped' & !is.na(species)) |>
select(Type, species) |> unique() |>
group_by(species) |>
summarize('Type' = list(Type))
b <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('16S: Shared correct species') +
ylab('Species')
b
type_list <- sixteenS_data_vs_16S_100 |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
filter(species != True_species) |>
filter(species != 'dropped' & !is.na(species)) |>
select(Type, species) |> unique() |>
group_by(species) |>
summarize('Type' = list(Type))
c <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('16S: Shared incorrect species') +
ylab('Species')
c
a + b + c & ylim(c(0, 20)) &
theme(
# Hide panel borders and remove grid lines
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
panel.grid.minor = element_blank(),
#panel.grid.major.y = element_line(),
# Change axis line
axis.line = element_line(colour = "black")
)
CO1_truth <- truth |> filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa') |> select(OTU, family, species) |> rename(True_OTU = OTU, True_family = family, True_species = species)
head(CO1_truth)
## # A tibble: 6 × 3
## True_OTU True_family True_species
## <chr> <chr> <chr>
## 1 ASV_1 Syngnathidae Phycodurus eques
## 2 ASV_2 Syngnathidae Phyllopteryx taeniolatus
## 3 ASV_3 Carcharhinidae Glyphis garricki
## 4 ASV_4 Syngnathidae dropped
## 5 ASV_5 Mullidae Parupeneus heptacantha
## 6 ASV_6 Otariidae Zalophus wollebaeki
CO1_relative_plot <- co1_data_vs_co1_100 |> left_join(CO1_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type) |>
count(CorrectSpecies) |>
mutate(total = sum(n)) |>
mutate(missing = 100 - total) |>
group_modify(~ add_row(.x)) |>
group_modify(~ {
.x |> mutate(new_col= max(missing, na.rm=TRUE)) |>
mutate(n = case_when(is.na(CorrectSpecies) & is.na(missing) ~ new_col,
TRUE ~ n)) |>
select(-new_col)
} ) |>
mutate(total = 99) |>
mutate(perc = n / total * 100) |>
mutate(CorrectSpecies = replace_na(CorrectSpecies, 'No hit')) |>
mutate(CorrectSpecies = factor(CorrectSpecies, rev(c('Correct species', 'Correct genus', 'Correct family', 'Incorrect family', 'Incorrect genus', 'Incorrect species', 'No hit')))) |>
tidyr::complete(CorrectSpecies, fill = list(n=0, total = 99, missing = NA, perc = 0)) |>
ggplot(aes(x = fct_rev(fct_reorder2(Type, CorrectSpecies, n)), fill = CorrectSpecies, y = perc))+
geom_col() +
coord_flip() +
theme_minimal() +
ylab('Percentage') + xlab('Type') +
ggtitle('CO1: Correct and incorrect species-level classifications (relative)') +
scale_fill_manual(name = 'Outcome', values = cols, breaks=names(cols))
CO1_relative_plot
scores <- add_scores(co1_data_vs_co1_100, CO1_truth)
scores
## # A tibble: 14 × 5
## Type TP FP TN FN
## <chr> <int> <int> <int> <dbl>
## 1 BLAST100 58 4 0 37
## 2 BLAST97 50 3 0 46
## 3 CustomNBC 63 22 0 14
## 4 DADA2Spec 57 4 0 38
## 5 DADA2Tax 10 37 0 52
## 6 Kraken_0.0 0 0 0 99
## 7 Kraken_0.05 0 0 0 99
## 8 Kraken_0.1 0 0 0 99
## 9 MMSeqs2_100 57 4 0 38
## 10 MMSeqs2_97 61 6 0 32
## 11 Metabuli 24 3 0 72
## 12 Mothur 64 22 0 13
## 13 Qiime2 51 17 0 31
## 14 VSEARCH 60 20 0 19
scores <- scores |> rowwise() |> mutate(Recall = recall(TP, FN), Precision = precision(TP, FP),
f1 = f1(Precision, Recall), f0.5 = f0.5(Precision, Recall), accuracy = accuracy(TP, FP, FN, TN))
scores
## # A tibble: 14 × 10
## # Rowwise:
## Type TP FP TN FN Recall Precision f1 f0.5 accuracy
## <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 BLAST100 58 4 0 37 0.611 0.935 0.739 0.845 0.586
## 2 BLAST97 50 3 0 46 0.521 0.943 0.671 0.812 0.505
## 3 CustomNBC 63 22 0 14 0.818 0.741 0.778 0.755 0.636
## 4 DADA2Spec 57 4 0 38 0.6 0.934 0.731 0.841 0.576
## 5 DADA2Tax 10 37 0 52 0.161 0.213 0.183 0.2 0.101
## 6 Kraken_0.0 0 0 0 99 0 0 0 0 0
## 7 Kraken_0.05 0 0 0 99 0 0 0 0 0
## 8 Kraken_0.1 0 0 0 99 0 0 0 0 0
## 9 MMSeqs2_100 57 4 0 38 0.6 0.934 0.731 0.841 0.576
## 10 MMSeqs2_97 61 6 0 32 0.656 0.910 0.762 0.845 0.616
## 11 Metabuli 24 3 0 72 0.25 0.889 0.390 0.588 0.242
## 12 Mothur 64 22 0 13 0.831 0.744 0.785 0.760 0.646
## 13 Qiime2 51 17 0 31 0.622 0.75 0.68 0.720 0.515
## 14 VSEARCH 60 20 0 19 0.759 0.75 0.755 0.752 0.606
CO1_score_plot <- scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |> ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1)) + theme_minimal_hgrid() + theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') + ggtitle('CO1 scores')
CO1_score_plot
b <- scores$Type
m <- scores |> select(-Type) |> select(accuracy, Recall, Precision, f1, f0.5) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
color = colorRampPalette(brewer.pal(n = 7, name =
"RdYlGn"))(100))
b <- scores$Type
m <- scores |> select(-Type) |> select(TP, FP, FN) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
color = colorRampPalette(brewer.pal(n = 7, name =
"RdYlGn"))(100))
### Calculate Upset-based species sightings
type_list <- co1_data_vs_co1_100 |> select(Type, species) |> unique() |> filter(!is.na(species) & species != 'dropped') |>
group_by(species) |>
summarize('Type' = list(Type))
a <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('CO1: Shared species') +
ylab('Species')
a
type_list <- co1_data_vs_co1_100 |> left_join(CO1_truth, by = c('OTU' = 'True_OTU')) |>
filter(species == True_species) |>
filter(species != 'dropped' & !is.na(species)) |>
select(Type, species) |> unique() |>
group_by(species) |>
summarize('Type' = list(Type))
b <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('CO1: Shared correct species') +
ylab('Species')
b
type_list <- co1_data_vs_co1_100 |> left_join(CO1_truth, by = c('OTU' = 'True_OTU')) |>
filter(species != True_species) |>
filter(species != 'dropped' & !is.na(species)) |>
select(Type, species) |> unique() |>
group_by(species) |>
summarize('Type' = list(Type))
c <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('CO1: Shared incorrect species') +
ylab('Species')
c
a + b + c & ylim(c(0, 20)) &
theme(
# Hide panel borders and remove grid lines
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
panel.grid.minor = element_blank(),
#panel.grid.major.y = element_line(),
# Change axis line
axis.line = element_line(colour = "black")
)
sixteenS_relative_plot / twelve_s_relative_plot / CO1_relative_plot
Let’s make without titles, but with a/b
(sixteenS_relative_plot + ggtitle('') + ylab(''))/ (twelve_s_relative_plot + ggtitle('')) / (CO1_relative_plot + ggtitle('')) +
plot_annotation(tag_levels = c('A','B')) +
plot_layout(guides = 'collect')
(sixteenS_score_plot +geom_point() + theme(axis.title.x = element_blank()))/ (twelveS_scoreS_plot + geom_point()) / (CO1_score_plot + geom_point())
twelve_exclusions <- data |> filter(str_starts(Subject, '12s_v010_final.fasta_Taxonomies.CountedFams.txt_')) |>
filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa')
twelve_exclusions_split <- twelve_exclusions |> separate(Subject, into = c('before', 'hit'), sep='.txt_') |>
# get rid of leftover non-subsetted databases
filter(!is.na(hit)) |>
separate(hit, into=c('Database', 'after'), sep='_subset')
twelve_exclusions_split_averaged <- twelve_exclusions_split |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type, Database, after) |>
summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
FN = sum(is.na(species) & !is.na(True_species))) |>
mutate(sums = TP + FP + TN + FN) |>
mutate(missing = 99 - sums) |>
mutate(FN = FN + missing) |>
mutate(sums = TP + FP + TN + FN) |>
select(-c(missing, sums)) |>
group_by(Type, Database) |>
summarise(mean_TP = mean(TP),
mean_FP = mean(FP),
mean_TN = mean(TN),
mean_FN = mean(FN)) |>
rowwise() |>
mutate(Recall = recall(mean_TP, mean_FN),
Precision = precision(mean_TP, mean_FP),
f1 = f1(Precision, Recall),
f0.5 = f0.5(Precision, Recall),
accuracy = accuracy(mean_TP, mean_FP, mean_FN, mean_TN))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
twelve_exclusions_split_averaged <- twelve_exclusions_split_averaged |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '30%', # we switch from seventy INCLUDED to 30 EXCLUDED
Database == 'thirty' ~ '70%',
TRUE ~ Database))
f1_pl <- twelve_exclusions_split_averaged |>
ggplot(aes(x = Type, y = f1, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
f0.5_pl <- twelve_exclusions_split_averaged |>
ggplot(aes(x = Type, y = f0.5, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
Precision_pl <- twelve_exclusions_split_averaged |>
ggplot(aes(x = Type, y = Precision, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
Recall_pl <- twelve_exclusions_split_averaged |>
ggplot(aes(x = Type, y = Recall, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
(f1_pl / f0.5_pl / Precision_pl / Recall_pl) + plot_layout(guides = 'collect')
Lets zero in on the Precision and make boxplots with jitter dots
un_summarised_twelve <- twelve_exclusions_split |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type, Database, after) |>
summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
FN = sum(is.na(species) & !is.na(True_species))) |>
mutate(sums = TP + FP + TN + FN) |>
mutate(missing = 99 - sums) |>
mutate(FN = FN + missing) |>
mutate(sums = TP + FP + TN + FN) |>
select(-c(missing, sums)) |>
rowwise() |>
mutate(Recall = recall(TP, FN),
Precision = precision(TP, FP),
f1 = f1(Precision, Recall),
f0.5 = f0.5(Precision, Recall),
accuracy = accuracy(TP, FP, FN, TN)) |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '30%',
Database == 'thirty' ~ '70%',
TRUE ~ Database))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
un_summarised_twelve |> group_by(Type, Database) |> mutate(best = max(mean(Precision))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = Precision)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('Precision') +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_twelve |> group_by(Type, Database) |> mutate(best = max(mean(f0.5))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = f0.5)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('f0.5') +
ylim(c(0, 1)) +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_twelve |> group_by(Type, Database) |> mutate(best = max(mean(Recall))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = Recall)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('f0.5') +
ylim(c(0, 1)) +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_twelve |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli', 'Mothur')) |>
ggplot(aes(x=Database, y = Precision, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot() +
labs(fill='Type') +
ylab('Precision') +
theme_minimal()
false_positives <- un_summarised_twelve |>
filter(Type %in% c('BLAST100', 'Kraken_0.0','Metabuli', 'Kraken_0.1', 'MMSeqs2')) |>
ggplot(aes(x=Database, y = FP/99*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot(outlier.shape=NA) +
labs(fill='Type') +
ylab('False positives (%)') +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE)+
theme_minimal()
false_positives
true_positives <- un_summarised_twelve |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli', 'Kraken_0.1', 'MMSeqs2')) |>
ggplot(aes(x=Database, y = TP/99*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot(outlier.shape=NA) +
labs(fill='Type') +
ylab('True positives (%)') +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE)+
theme_minimal()
true_positives
false_positives/ true_positives + plot_layout(guides = 'collect') & coord_flip()
### Phylogenetic diversity
We can also easily calculate alpha diversity across these tools, as alpha diversity is the number of species. We treat classifiers/Types as sites.
spec_summarised <- twelve_exclusions_split |>
group_by(Type, Query, Database, after) |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '30%',
Database == 'thirty' ~ '70%',
TRUE ~ Database)) |>
filter(!is.na(species)) |>
summarise(`Alpha diversity index` = length(unique(species)))
## `summarise()` has grouped output by 'Type', 'Query', 'Database'. You can
## override using the `.groups` argument.
spec_summarised |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot() +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) + coord_flip() + theme_minimal()
Let’s also do not all of the classifiers
spec_summarised |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli', 'Kraken_0.1','MMSeqs2', 'Mothur')) |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot() +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) + coord_flip() + theme_minimal()
a <- spec_summarised |>
filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0','Metabuli', 'Kraken_0.1','MMSeqs2', 'Mothur')) |>
group_by(Database) |>
arrange(Database) |>
group_map(~aov(`Alpha diversity index` ~ Type, data=.))
names(a) <- spec_summarised |> arrange(Database) |> pull(Database) |> unique() # I don't like this :(
a
## $`30%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 6544.683 1186.300
## Deg. of Freedom 5 54
##
## Residual standard error: 4.687059
## Estimated effects may be unbalanced
##
## $`50%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 10960.15 1365.50
## Deg. of Freedom 5 54
##
## Residual standard error: 5.028622
## Estimated effects may be unbalanced
##
## $`70%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 16836.95 655.90
## Deg. of Freedom 5 54
##
## Residual standard error: 3.485154
## Estimated effects may be unbalanced
groupslist <- list()
for(key in names(a)) {
print(key)
groupslist[[key]] <- HSD.test(a[[key]], 'Type')$groups|>
as_tibble(rownames = 'Type') |>
select(-`Alpha diversity index`)
}
## [1] "30%"
## [1] "50%"
## [1] "70%"
groups_df <- bind_rows(groupslist, .id='Database')
twelve_s_exclusions_fig <- spec_summarised |>
filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0','Metabuli', 'Kraken_0.1','MMSeqs2', 'Mothur')) |>
left_join(groups_df, by = c('Database', 'Type')) |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot(outlier.shape=NA) +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) +
geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
#col = 'black',
size = 4) +
#coord_flip() +
theme_minimal() +
theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
guides(fill="none")
twelve_s_exclusions_fig
twelve_spec_summarised <- spec_summarised
twelve_spec_summarised$gene <- '12S'
sixteen_exclusions <- data |> filter(str_starts(Subject, '16S_v04_final.fasta_Taxonomies.')) |>
filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa')
table(sixteen_exclusions$Subject)
##
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 1216
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 1153
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 1178
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 1222
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 1226
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 1197
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 1179
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 1181
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 1163
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 1192
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 1186
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 1243
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 1255
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 1238
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 1245
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 1263
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 1239
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 1245
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 1249
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 1248
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 1123
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 1162
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 1141
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 1127
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 1158
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 1129
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 1126
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 1149
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 1128
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 1144
sixteen_exclusions_split <- sixteen_exclusions |> separate(Subject, into = c('before', 'hit'), sep='.txt_') |>
# get rid of leftover non-subsetted databases
filter(!is.na(hit)) |>
separate(hit, into=c('Database', 'after'), sep='_subset')
sixteen_exclusions_split_averaged <- sixteen_exclusions_split |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type, Database, after) |>
summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
FN = sum(is.na(species) & !is.na(True_species))) |>
mutate(sums = TP + FP + TN + FN) |>
mutate(missing = 99 - sums) |>
mutate(FN = FN + missing) |>
mutate(sums = TP + FP + TN + FN) |>
select(-c(missing, sums)) |>
group_by(Type, Database) |>
summarise(mean_TP = mean(TP),
mean_FP = mean(FP),
mean_TN = mean(TN),
mean_FN = mean(FN)) |>
rowwise() |>
mutate(Recall = recall(mean_TP, mean_FN),
Precision = precision(mean_TP, mean_FP),
f1 = f1(Precision, Recall),
f0.5 = f0.5(Precision, Recall),
accuracy = accuracy(mean_TP, mean_FP, mean_FN, mean_TN))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
sixteen_exclusions_split_averaged <- sixteen_exclusions_split_averaged |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '30%',
Database == 'thirty' ~ '70%',
TRUE ~ Database))
f1_pl <- sixteen_exclusions_split_averaged |>
ggplot(aes(x = Type, y = f1, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
f0.5_pl <- sixteen_exclusions_split_averaged |>
ggplot(aes(x = Type, y = f0.5, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
Precision_pl <- sixteen_exclusions_split_averaged |>
ggplot(aes(x = Type, y = Precision, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
Recall_pl <- sixteen_exclusions_split_averaged |>
ggplot(aes(x = Type, y = Recall, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
(f1_pl / f0.5_pl / Precision_pl / Recall_pl) + plot_layout(guides = 'collect')
Lets zero in on the Precision and make boxplots with jitter dots
un_summarised_sixteen <- sixteen_exclusions_split |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type, Database, after) |>
summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
FN = sum(is.na(species) & !is.na(True_species))) |>
mutate(sums = TP + FP + TN + FN) |>
mutate(missing = 99 - sums) |>
mutate(FN = FN + missing) |>
mutate(sums = TP + FP + TN + FN) |>
select(-c(missing, sums)) |>
rowwise() |>
mutate(Recall = recall(TP, FN),
Precision = precision(TP, FP),
f1 = f1(Precision, Recall),
f0.5 = f0.5(Precision, Recall),
accuracy = accuracy(TP, FP, FN, TN)) |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '30%',
Database == 'thirty' ~ '70%',
TRUE ~ Database))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
un_summarised_sixteen |> group_by(Type, Database) |> mutate(best = max(mean(Precision, na.rm=TRUE))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = Precision)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('Precision') +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_sixteen |> group_by(Type, Database) |> mutate(best = max(mean(f0.5, na.rm=TRUE))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = f0.5)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('f0.5') +
ylim(c(0, 1)) +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_sixteen |> group_by(Type, Database) |> mutate(best = max(mean(Recall))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = Recall)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('f0.5') +
ylim(c(0, 1)) +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_sixteen |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli', 'Mothur')) |>
ggplot(aes(x=Database, y = Precision, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot() +
labs(fill='Type') +
ylab('Precision') +
theme_minimal()
false_positives <- un_summarised_sixteen |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli', 'Kraken_0.1', 'MMSeqs2')) |>
ggplot(aes(x=Database, y = FP/99*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot(outlier.shape=NA) +
labs(fill='Type') +
ylab('False positives (%)') +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE)+
theme_minimal()
false_positives
true_positives <- un_summarised_sixteen |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli', 'Kraken_0.1', 'MMSeqs2')) |>
ggplot(aes(x=Database, y = TP/99*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot(outlier.shape=NA) +
labs(fill='Type') +
ylab('True positives (%)') +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE)+
theme_minimal()
true_positives
false_positives/ true_positives + plot_layout(guides = 'collect') & coord_flip()
### Phylogenetic diversity
We can also easily calculate alpha diversity across these tools, as alpha diversity is the number of species. We treat classifiers/Types as sites.
spec_summarised <- sixteen_exclusions_split |>
group_by(Type, Query, Database, after) |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '30%',
Database == 'thirty' ~ '70%',
TRUE ~ Database)) |>
filter(!is.na(species)) |>
summarise(`Alpha diversity index` = length(unique(species)))
## `summarise()` has grouped output by 'Type', 'Query', 'Database'. You can
## override using the `.groups` argument.
spec_summarised |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot() +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) + coord_flip() + theme_minimal()
Let’s also do not all of the classifiers
spec_summarised |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli', 'Kraken_0.1','MMSeqs2', 'Mothur')) |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot() +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) + coord_flip() + theme_minimal()
a <- spec_summarised |>
filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0', 'Metabuli', 'Kraken_0.1','MMSeqs2', 'Mothur')) |>
group_by(Database) |>
arrange(Database) |>
group_map(~aov(`Alpha diversity index` ~ Type, data=.))
names(a) <- spec_summarised |> arrange(Database) |> pull(Database) |> unique() # I don't like this :(
a
## $`30%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 9595.627 371.322
## Deg. of Freedom 5 53
##
## Residual standard error: 2.6469
## Estimated effects may be unbalanced
##
## $`50%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 15002.08 1554.90
## Deg. of Freedom 5 54
##
## Residual standard error: 5.366046
## Estimated effects may be unbalanced
##
## $`70%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 18225.48 700.70
## Deg. of Freedom 5 54
##
## Residual standard error: 3.602211
## Estimated effects may be unbalanced
library(agricolae)
groupslist <- list()
for(key in names(a)) {
print(key)
groupslist[[key]] <- HSD.test(a[[key]], 'Type')$groups|>
as_tibble(rownames = 'Type') |>
select(-`Alpha diversity index`)
}
## [1] "30%"
## [1] "50%"
## [1] "70%"
groups_df <- bind_rows(groupslist, .id='Database')
sixteen_s_exclusions_fig <- spec_summarised |>
filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0', 'Metabuli', 'Kraken_0.1','MMSeqs2', 'Mothur')) |>
left_join(groups_df, by = c('Database', 'Type')) |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot(outlier.shape=NA) +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) +
geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
#col = 'black',
size = 4) +
#coord_flip() +
theme_minimal() +
theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
guides(fill="none")
sixteen_s_exclusions_fig
sixteen_spec_summarised <- spec_summarised
sixteen_spec_summarised$gene <- '16S'
co1_exclusions <- data |> filter(str_starts(Subject, 'c01_v03_final.fasta_Taxonomies.')) |>
filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa')
table(co1_exclusions$Subject)
##
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 1157
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 1181
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 1159
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 1208
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 1150
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 1207
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 1194
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 1224
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 1151
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 1196
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 1269
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 1241
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 1237
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 1212
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 1253
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 1245
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 1208
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 1254
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 1266
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 1248
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 1155
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 1176
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 1158
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 1101
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 1100
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 1132
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 1154
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 1115
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 1116
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 1109
co1_exclusions_split <- co1_exclusions |> separate(Subject, into = c('before', 'hit'), sep='.txt_') |>
# get rid of leftover non-subsetted databases
filter(!is.na(hit)) |>
separate(hit, into=c('Database', 'after'), sep='_subset')
co1_exclusions_split_averaged <- co1_exclusions_split |> left_join(CO1_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type, Database, after) |>
summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
FN = sum(is.na(species) & !is.na(True_species))) |>
mutate(sums = TP + FP + TN + FN) |>
mutate(missing = 99 - sums) |>
mutate(FN = FN + missing) |>
mutate(sums = TP + FP + TN + FN) |>
select(-c(missing, sums)) |>
group_by(Type, Database) |>
summarise(mean_TP = mean(TP),
mean_FP = mean(FP),
mean_TN = mean(TN),
mean_FN = mean(FN)) |>
rowwise() |>
mutate(Recall = recall(mean_TP, mean_FN),
Precision = precision(mean_TP, mean_FP),
f1 = f1(Precision, Recall),
f0.5 = f0.5(Precision, Recall),
accuracy = accuracy(mean_TP, mean_FP, mean_FN, mean_TN))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
co1_exclusions_split_averaged <- co1_exclusions_split_averaged |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '30%',
Database == 'thirty' ~ '70%',
TRUE ~ Database))
f1_pl <- co1_exclusions_split_averaged |>
ggplot(aes(x = Type, y = f1, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
f0.5_pl <- co1_exclusions_split_averaged |>
ggplot(aes(x = Type, y = f0.5, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
Precision_pl <- co1_exclusions_split_averaged |>
ggplot(aes(x = Type, y = Precision, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
Recall_pl <- co1_exclusions_split_averaged |>
ggplot(aes(x = Type, y = Recall, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
(f1_pl / f0.5_pl / Precision_pl / Recall_pl) + plot_layout(guides = 'collect')
Lets zero in on the Precision and make boxplots with jitter dots
un_summarised_co1 <- co1_exclusions_split |> left_join(CO1_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type, Database, after) |>
summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
FN = sum(is.na(species) & !is.na(True_species))) |>
mutate(sums = TP + FP + TN + FN) |>
mutate(missing = 99 - sums) |>
mutate(FN = FN + missing) |>
mutate(sums = TP + FP + TN + FN) |>
select(-c(missing, sums)) |>
rowwise() |>
mutate(Recall = recall(TP, FN),
Precision = precision(TP, FP),
f1 = f1(Precision, Recall),
f0.5 = f0.5(Precision, Recall),
accuracy = accuracy(TP, FP, FN, TN)) |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '30%',
Database == 'thirty' ~ '70%',
TRUE ~ Database))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
un_summarised_co1 |> group_by(Type, Database) |> mutate(best = max(mean(Precision, na.rm=TRUE))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = Precision)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('Precision') +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_co1 |> group_by(Type, Database) |> mutate(best = max(mean(f0.5, na.rm=TRUE))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = f0.5)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('f0.5') +
ylim(c(0, 1)) +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_co1 |> group_by(Type, Database) |> mutate(best = max(mean(Recall))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = Recall)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('f0.5') +
ylim(c(0, 1)) +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_co1 |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli', 'Mothur')) |>
ggplot(aes(x=Database, y = Precision, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot() +
labs(fill='Type') +
ylab('Precision') +
theme_minimal()
false_positives <- un_summarised_co1 |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli', 'Kraken_0.1', 'MMSeqs2')) |>
ggplot(aes(x=Database, y = FP/99*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot(outlier.shape=NA) +
labs(fill='Type') +
ylab('False positives (%)') +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE)+
theme_minimal()
false_positives
true_positives <- un_summarised_co1 |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli', 'Kraken_0.1', 'MMSeqs2')) |>
ggplot(aes(x=Database, y = TP/99*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot(outlier.shape=NA) +
labs(fill='Type') +
ylab('True positives (%)') +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE)+
theme_minimal()
true_positives
false_positives/ true_positives + plot_layout(guides = 'collect') & coord_flip()
### Phylogenetic diversity
We can also easily calculate alpha diversity across these tools, as alpha diversity is the number of species. We treat classifiers/Types as sites.
spec_summarised <- co1_exclusions_split |>
group_by(Type, Query, Database, after) |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '30%',
Database == 'thirty' ~ '70%',
TRUE ~ Database)) |>
filter(!is.na(species)) |>
summarise(`Alpha diversity index` = length(unique(species)))
## `summarise()` has grouped output by 'Type', 'Query', 'Database'. You can
## override using the `.groups` argument.
spec_summarised |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot() +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) + coord_flip() + theme_minimal()
Let’s also do not all of the classifiers
spec_summarised |>
filter(Type %in% c('BLAST97', 'Kraken_0.0', 'Metabuli', 'MMSeqs2_97', 'Mothur')) |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot() +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) + coord_flip() + theme_minimal()
a <- spec_summarised |>
filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |>
group_by(Database) |>
arrange(Database) |>
group_map(~aov(`Alpha diversity index` ~ Type, data=.))
names(a) <- spec_summarised |> arrange(Database) |> pull(Database) |> unique() # I don't like this :(
a
## $`30%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 5882.075 1088.900
## Deg. of Freedom 3 36
##
## Residual standard error: 5.499747
## Estimated effects may be unbalanced
##
## $`50%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 1677.875 503.900
## Deg. of Freedom 3 36
##
## Residual standard error: 3.741286
## Estimated effects may be unbalanced
##
## $`70%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 14061.93 825.00
## Deg. of Freedom 5 54
##
## Residual standard error: 3.90868
## Estimated effects may be unbalanced
library(agricolae)
groupslist <- list()
for(key in names(a)) {
print(key)
groupslist[[key]] <- HSD.test(a[[key]], 'Type')$groups|>
as_tibble(rownames = 'Type') |>
select(-`Alpha diversity index`)
}
## [1] "30%"
## [1] "50%"
## [1] "70%"
groups_df <- bind_rows(groupslist, .id='Database')
co1_s_exclusions_fig <- spec_summarised |>
filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |>
left_join(groups_df, by = c('Database', 'Type')) |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot(outlier.shape=NA) +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) +
geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
#col = 'black',
size = 4) +
#coord_flip() +
theme_minimal() +
theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
guides(fill="none")
co1_s_exclusions_fig
co1_spec_summarised <- spec_summarised
co1_spec_summarised$gene <- 'CO1'
both <- rbind(twelve_spec_summarised, sixteen_spec_summarised, co1_spec_summarised)
both_fig <- both |>
filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |>
left_join(groups_df, by = c('Database', 'Type')) |>
mutate(Type = gsub('Kraken', 'Kraken2', Type)) |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot(outlier.shape=NA) +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) +
geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
#col = 'black',
size = 3) +
#coord_flip() +
theme_minimal() +
theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
guides(fill="none") +
facet_wrap(~gene+Database, nrow = 3)
both_fig
my_save_plot(both_fig, 'exclusions_fig')
I want to remove 50%, there’s too much going on. I also want to use patchwork
top_12 <- both |>
filter(gene == '12S') |>
filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |>
left_join(groups_df, by = c('Database', 'Type')) |>
filter(Database != '50%') |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot(outlier.shape=NA) +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) +
geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
#col = 'black',
size = 3) +
#coord_flip() +
theme_minimal() +
theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
guides(fill="none") +
ylab('Number of identified species')+
facet_wrap(~Database, nrow = 1) +
theme(panel.spacing = unit(2, "lines")) +
ylim(c(10,90))
middle_16 <- both |>
filter(gene == '16S') |>
filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |>
left_join(groups_df, by = c('Database', 'Type')) |>
filter(Database != '50%') |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot(outlier.shape=NA) +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) +
geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
#col = 'black',
size = 3) +
#coord_flip() +
theme_minimal() +
theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
guides(fill="none") +
ylab('Number of identified species')+
facet_wrap(~Database, nrow = 1) +
theme(panel.spacing = unit(2, "lines"))+
ylim(c(10,90))
bottom_coi <- both |>
filter(gene == 'CO1') |>
filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |>
left_join(groups_df, by = c('Database', 'Type')) |>
filter(Database != '50%') |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot(outlier.shape=NA) +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) +
geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
#col = 'black',
size = 3) +
#coord_flip() +
theme_minimal() +
theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
guides(fill="none") +
ylab('Number of identified species')+
xlab('Classifier')+
facet_wrap(~Database, nrow = 1) +
theme(panel.spacing = unit(2, "lines"))+
ylim(c(10,90))
without50_fig <- (top_12 + theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
) + ylab(NULL)) / (middle_16 + theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
) ) / (bottom_coi + ylab(NULL)) + plot_annotation(tag_levels = 'A') +
plot_layout(guides = 'collect')
without50_fig
my_save_plot(without50_fig, 'exclusions_without50_fig')
Hmmm I don’t like this.
Let’s do it different:
without_50_redesigned_top <- both |>
filter(gene == '12S') |>
filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |>
left_join(groups_df, by = c('Database', 'Type')) |>
mutate(Type = gsub('Kraken', 'Kraken2', Type)) |>
filter(Database != '50%') |>
ggplot(aes(x = Database, y = `Alpha diversity index`, fill=Type, group = Database )) +
geom_boxplot(outlier.shape=NA, show.legend = FALSE) +
facet_wrap(~Type, nrow=1) +
ylab('Number of identified species') +
xlab('Exclusion percentage') +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE)
without_50_redesigned_bottom <- both |>
filter(gene == 'CO1') |>
filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |>
left_join(groups_df, by = c('Database', 'Type')) |>
mutate(Type = gsub('Kraken', 'Kraken2', Type)) |>
filter(Database != '50%') |>
ggplot(aes(x = Database, y = `Alpha diversity index`, fill=Type, group = Database )) +
geom_boxplot(outlier.shape=NA, show.legend = FALSE) +
facet_wrap(~Type, nrow=1) +
ylab('Number of identified species') +
xlab('Exclusion percentage') +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE)
both_figs <- (without_50_redesigned_top + theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())) / without_50_redesigned_bottom + plot_annotation(tag_levels = 'A')
my_save_plot(both_figs, 'exclusions_without50_fig_redesigned')
Let’s make the tables too:
tw <- twelve_exclusions_split_averaged
tw$Gene <- '12S'
six <- sixteen_exclusions_split_averaged
six$Gene <- '16S'
co <- co1_exclusions_split_averaged
co$Gene <- 'CO1'
both <- rbind(tw, six, co) |> relocate(Gene)
both |> my_save_table('all_exclusion_databases_scores')
both |> group_by(Gene, Database) |> arrange(desc(f1)) |> slice(1) |> ungroup() |>
select(Gene, Type, Database, accuracy, Precision, Recall, f1, f0.5, accuracy) |>
my_save_table('averaged_top1_exclusion_database_scores')